#include "opencl_source_map.hpp" 
namespace MNN { 
#ifndef MNN_OPENCL_BUFFER_CLOSED
const char* gemv_conv1x1_buf = 
"#ifdef MNN_SUPPORT_FP16\n"
"#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"
"#endif\n"
"__constant sampler_t SAMPLER=CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n"
"#define GLOBAL_SIZE_DIM_2 "" __private int global_size_dim0,__private int global_size_dim1,\n"
"#define GLOBAL_SIZE_DIM_3 "" __private int global_size_dim0,__private int global_size_dim1,__private int global_size_dim2,\n"
"#define UNIFORM_BOUNDRY_CHECK_2(index0, index1) "" if(index0 >= global_size_dim0 || index1 >= global_size_dim1) { "" return; "" }\n"
"#define UCHAR4_TO_CHAR8(b, scale, offset) "" wei.s0 = (COMPUTE_FLOAT)((b.s0 >> 4) - 8); "" wei.s1 = (COMPUTE_FLOAT)((b.s0 & 15) - 8); "" wei.s2 = (COMPUTE_FLOAT)((b.s1 >> 4) - 8); "" wei.s3 = (COMPUTE_FLOAT)((b.s1 & 15) - 8); "" wei.s4 = (COMPUTE_FLOAT)((b.s2 >> 4) - 8); "" wei.s5 = (COMPUTE_FLOAT)((b.s2 & 15) - 8); "" wei.s6 = (COMPUTE_FLOAT)((b.s3 >> 4) - 8); "" wei.s7 = (COMPUTE_FLOAT)((b.s3 & 15) - 8); "" wei=wei*scale+offset;\n"
"#if WGS >= 8\n"
"__kernel void gemv_conv_c8_buf(GLOBAL_SIZE_DIM_3\n"
" __global const FLOAT* input,\n"
"#ifdef USE_IMAGE\n"
" __read_only image2d_t weight,\n"
"#else\n"
" #if QUANT_BIT == 8\n"
" __global const char *weight,\n"
" #else\n"
" __global const uchar *weight,\n"
" #endif\n"
"#endif\n"
" __global const FLOAT *dequantScaleOffset,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT* output,\n"
" __private const int dstChannelAlign,\n"
" __private const int srcChannelAlign,\n"
" __private const int dstChannelC4,\n"
" __private const int srcChannelC4,\n"
" __private const int srcChannel,\n"
" __private const int blockNum,\n"
" __private const int blockDim,\n"
" __private const float coef) {\n"
" const int lid=get_local_id(0);\n"
" const int oc=get_global_id(1); //oc/8\n"
" const int oc8=oc << 3;\n"
" \n"
"#if QUANT_BIT == 8\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" const int loop=max((srcChannel+2-1)/2-1,0);\n"
" #else\n"
" const int loop=(srcChannel+2-1)/2;\n"
" #endif\n"
" #ifndef USE_IMAGE\n"
" const int weight_offset=oc*srcChannelC4*32;\n"
" #endif\n"
"#else\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" const int loop=max((srcChannel+4-1)/4-1,0);\n"
" #else\n"
" const int loop=(srcChannel+4-1)/4;\n"
" #endif\n"
" #ifndef USE_IMAGE\n"
" const int weight_offset=oc*srcChannelC4*16;\n"
" #endif\n"
"#endif\n"
" COMPUTE_FLOAT8 out0=0;\n"
" int input_offset=0,output_offset=oc8;\n"
" __local COMPUTE_FLOAT8 sum0[WGS];\n"
"#ifdef COMPUTE_BATCH\n"
" const int out_b_idx=get_global_id(2) << 2; //b/4\n"
" __local COMPUTE_FLOAT8 sum1[WGS];\n"
" __local COMPUTE_FLOAT8 sum2[WGS];\n"
" __local COMPUTE_FLOAT8 sum3[WGS];\n"
" COMPUTE_FLOAT8 out1=0,out2=0,out3=0;\n"
" input_offset=out_b_idx*srcChannelAlign;\n"
" output_offset=oc8+out_b_idx*dstChannelAlign;\n"
"#endif\n"
" for(int j=lid; j<loop; j+=WGS){\n"
" #if QUANT_BIT == 8\n"
" int k2=j << 1;\n"
" COMPUTE_FLOAT16 scale,offset;\n"
" {\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+oc8*2+(k2/blockDim)*dstChannelC4*8))/coef);\n"
" scale=(COMPUTE_FLOAT16)(scaleOffset.s02468ace,scaleOffset.s02468ace);\n"
" offset=(COMPUTE_FLOAT16)(scaleOffset.s13579bdf,scaleOffset.s13579bdf);\n"
" #else\n"
" COMPUTE_FLOAT8 scaleOffset=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+oc8+(k2/blockDim)*dstChannelC4*4))/coef);\n"
" scale=(COMPUTE_FLOAT16)(scaleOffset,scaleOffset);\n"
" offset=0;\n"
" #endif\n"
" }\n"
" COMPUTE_FLOAT2 in=CONVERT_COMPUTE_FLOAT2(vload2(0,input+input_offset+k2));\n"
" #ifdef COMPUTE_BATCH\n"
" COMPUTE_FLOAT2 in1=CONVERT_COMPUTE_FLOAT2(vload2(0,input+input_offset+srcChannelAlign+k2));\n"
" COMPUTE_FLOAT2 in2=CONVERT_COMPUTE_FLOAT2(vload2(0,input+input_offset+srcChannelAlign*2+k2));\n"
" COMPUTE_FLOAT2 in3=CONVERT_COMPUTE_FLOAT2(vload2(0,input+input_offset+srcChannelAlign*3+k2));\n"
" #endif\n"
" #ifdef USE_IMAGE\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(j,oc))))*scale+offset;\n"
" #else\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(vload16(j,weight+weight_offset))*scale+offset;\n"
" #endif\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s0,wei.s01234567,out0);\n"
" #ifdef COMPUTE_BATCH\n"
" out1=mad((COMPUTE_FLOAT8)in1.s0,wei.s01234567,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in2.s0,wei.s01234567,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in3.s0,wei.s01234567,out3);\n"
" #endif\n"
" }\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s1,wei.s89abcdef,out0);\n"
" #ifdef COMPUTE_BATCH\n"
" out1=mad((COMPUTE_FLOAT8)in1.s1,wei.s89abcdef,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in2.s1,wei.s89abcdef,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in3.s1,wei.s89abcdef,out3);\n"
" #endif\n"
" }\n"
" #else\n"
" int k4=j << 2;\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 scale,offset;\n"
" {\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+oc8*2+(k4/blockDim)*dstChannelC4*8))/coef);\n"
" scale=scaleOffset.s02468ace;\n"
" offset=scaleOffset.s13579bdf;\n"
" }\n"
" #else\n"
" COMPUTE_FLOAT8 scale=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+oc8+(k4/blockDim)*dstChannelC4*4))/coef);\n"
" COMPUTE_FLOAT8 offset=0;\n"
" #endif\n"
" COMPUTE_FLOAT8 wei;\n"
" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+k4+input_offset));\n"
" #ifdef COMPUTE_BATCH\n"
" COMPUTE_FLOAT4 in1=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+srcChannelAlign+k4));\n"
" COMPUTE_FLOAT4 in2=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+srcChannelAlign*2+k4));\n"
" COMPUTE_FLOAT4 in3=CONVERT_COMPUTE_FLOAT4(vload4(0,input+input_offset+srcChannelAlign*3+k4));\n"
" #endif\n"
" #ifdef USE_IMAGE\n"
" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(j,oc)));\n"
" #else\n"
" uchar16 charWeightsInt40=vload16(j,weight+weight_offset);\n"
" #endif\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s0123,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s0,wei,out0);\n"
" #ifdef COMPUTE_BATCH\n"
" out1=mad((COMPUTE_FLOAT8)in1.s0,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in2.s0,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in3.s0,wei,out3);\n"
" #endif\n"
" }\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s4567,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s1,wei,out0);\n"
" #ifdef COMPUTE_BATCH\n"
" out1=mad((COMPUTE_FLOAT8)in1.s1,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in2.s1,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in3.s1,wei,out3);\n"
" #endif\n"
" }\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s89ab,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s2,wei,out0);\n"
" #ifdef COMPUTE_BATCH\n"
" out1=mad((COMPUTE_FLOAT8)in1.s2,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in2.s2,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in3.s2,wei,out3);\n"
" #endif\n"
" }\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.scdef,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s3,wei,out0);\n"
" #ifdef COMPUTE_BATCH\n"
" out1=mad((COMPUTE_FLOAT8)in1.s3,wei,out1);\n"
" out2=mad((COMPUTE_FLOAT8)in2.s3,wei,out2);\n"
" out3=mad((COMPUTE_FLOAT8)in3.s3,wei,out3);\n"
" #endif\n"
" }\n"
" #endif\n"
" }\n"
"#if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" {\n"
" #if QUANT_BIT == 8\n"
" int k2=loop << 1;\n"
" COMPUTE_FLOAT16 scale,offset;\n"
" {\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+oc8*2+(k2/blockDim)*dstChannelC4*8))/coef);\n"
" scale=(COMPUTE_FLOAT16)(scaleOffset.s02468ace,scaleOffset.s02468ace);\n"
" offset=(COMPUTE_FLOAT16)(scaleOffset.s13579bdf,scaleOffset.s13579bdf);\n"
" #else\n"
" COMPUTE_FLOAT8 scaleOffset=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+oc8+(k2/blockDim)*dstChannelC4*4))/coef);\n"
" scale=(COMPUTE_FLOAT16)(scaleOffset,scaleOffset);\n"
" offset=0;\n"
" #endif\n"
" }\n"
" #ifdef USE_IMAGE\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(loop,oc))))*scale+offset;\n"
" #else\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(vload16(loop,weight+weight_offset))*scale+offset;\n"
" #endif\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)input[k2],wei.s01234567,out0);\n"
" }\n"
" #else\n"
" int k4=loop << 2;\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 scale,offset;\n"
" {\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+oc8*2+(k4/blockDim)*dstChannelC4*8))/coef);\n"
" scale=scaleOffset.s02468ace;\n"
" offset=scaleOffset.s13579bdf;\n"
" }\n"
" #else\n"
" COMPUTE_FLOAT8 scale=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+oc8+(k4/blockDim)*dstChannelC4*4))/coef);\n"
" COMPUTE_FLOAT8 offset=0;\n"
" #endif\n"
" COMPUTE_FLOAT8 wei;\n"
" #ifdef USE_IMAGE\n"
" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(loop,oc)));\n"
" #else\n"
" uchar16 charWeightsInt40=vload16(loop,weight+weight_offset);\n"
" #endif\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s0123,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)input[k4],wei,out0);\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 2\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s4567,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)input[k4+1],wei,out0);\n"
" }\n"
" #endif\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 3\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s89ab,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)input[k4+2],wei,out0);\n"
" }\n"
" #endif\n"
" #endif\n"
" }\n"
"#endif\n"
" sum0[lid]=out0;\n"
" #ifdef COMPUTE_BATCH\n"
" sum1[lid]=out1; sum2[lid]=out2; sum3[lid]=out3;\n"
" #endif\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" for(int i=WGS/2; i>0; i /= 2){\n"
" if (lid<i){\n"
" sum0[lid]=sum0[lid]+sum0[lid+i];\n"
" #ifdef COMPUTE_BATCH\n"
" sum1[lid]=sum1[lid]+sum1[lid+i];\n"
" sum2[lid]=sum2[lid]+sum2[lid+i];\n"
" sum3[lid]=sum3[lid]+sum3[lid+i];\n"
" #endif\n"
" }\n"
" barrier(CLK_LOCAL_MEM_FENCE);\n"
" }\n"
" if(lid == 0){\n"
" COMPUTE_FLOAT8 vBias=CONVERT_COMPUTE_FLOAT8(vload8(0,bias+oc8));\n"
" out0=sum0[0]+vBias;\n"
" #ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT8)0);\n"
" #endif\n"
" #ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
" #endif\n"
" #ifdef OUTPUT_CHANNEL_LEAVES\n"
" vstore4(CONVERT_FLOAT4(out0.s0123),0,output+output_offset);\n"
" if(oc8+4<dstChannelC4*4)\n"
" vstore4(CONVERT_FLOAT4(out0.s4567),0,output+4+output_offset);\n"
" #else\n"
" vstore8(CONVERT_FLOAT8(out0),0,output+output_offset);\n"
" #endif\n"
" #ifdef COMPUTE_BATCH\n"
" out1=sum1[0]+vBias; out2=sum2[0]+vBias; out3=sum3[0]+vBias;\n"
" #ifdef RELU\n"
" out1=fmax(out1,(COMPUTE_FLOAT8)0);out2=fmax(out2,(COMPUTE_FLOAT8)0);out3=fmax(out3,(COMPUTE_FLOAT8)0);\n"
" #endif\n"
" #ifdef RELU6\n"
" out1=clamp(out1,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);out2=clamp(out2,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);out3=clamp(out3,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
" #endif\n"
" vstore8(CONVERT_FLOAT8(out1),0,output+output_offset+dstChannelAlign);\n"
" vstore8(CONVERT_FLOAT8(out2),0,output+output_offset+dstChannelAlign+dstChannelAlign);\n"
" vstore8(CONVERT_FLOAT8(out3),0,output+output_offset+dstChannelAlign+dstChannelAlign+dstChannelAlign);\n"
" #endif\n"
" }\n"
"}\n"
"#else\n"
"__kernel void gemv_conv_c8_buf(GLOBAL_SIZE_DIM_3\n"
" __global const FLOAT* input,\n"
"#ifdef USE_IMAGE\n"
" __read_only image2d_t weight,\n"
"#else\n"
" #if QUANT_BIT == 8\n"
" __global const char *weight,\n"
" #else\n"
" __global const uchar *weight,\n"
" #endif\n"
"#endif\n"
" __global const FLOAT *dequantScaleOffset,\n"
" __global const FLOAT *bias,\n"
" __global FLOAT* output,\n"
" __private const int dstChannelAlign,\n"
" __private const int srcChannelAlign,\n"
" __private const int dstChannelC4,\n"
" __private const int srcChannelC4,\n"
" __private const int srcChannel,\n"
" __private const int blockNum,\n"
" __private const int blockDim,\n"
" __private const float coef) {\n"
" const int ic=get_global_id(0);\n"
" const int oc=get_global_id(1); //oc/8\n"
" \n"
" UNIFORM_BOUNDRY_CHECK_2(ic,oc);\n"
" const int oc8=oc << 3;\n"
"#if QUANT_BIT == 8\n"
" const int loop=(blockDim+2-1)/2;\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" const int loop_end=max(loop-1,0);\n"
" #else\n"
" const int loop_end=loop;\n"
" #endif\n"
" #ifndef USE_IMAGE\n"
" const int weight_offset=oc*srcChannelC4*32;\n"
" #endif\n"
"#else\n"
" const int loop=(blockDim+4-1)/4;\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" const int loop_end=max(loop-1,0);\n"
" #else\n"
" const int loop_end=loop;\n"
" #endif\n"
" #ifndef USE_IMAGE\n"
" const int weight_offset=oc*srcChannelC4*16;\n"
" #endif\n"
"#endif\n"
" COMPUTE_FLOAT8 out0=CONVERT_COMPUTE_FLOAT8(vload8(0,bias+oc8));\n"
" for (int i=0; i<blockNum; i++){\n"
" #if QUANT_BIT == 8\n"
" COMPUTE_FLOAT16 scale,offset;\n"
" {\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+oc8*2+i*dstChannelC4*8))/coef);\n"
" scale=(COMPUTE_FLOAT16)(scaleOffset.s02468ace,scaleOffset.s02468ace);\n"
" offset=(COMPUTE_FLOAT16)(scaleOffset.s13579bdf,scaleOffset.s13579bdf);\n"
" #else\n"
" COMPUTE_FLOAT8 scaleOffset=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+oc8+i*dstChannelC4*4))/coef);\n"
" scale=(COMPUTE_FLOAT16)(scaleOffset,scaleOffset);\n"
" offset=0;\n"
" #endif\n"
" }\n"
" for (int j=0; j<loop_end; j++) {\n"
" int k=i*loop+j;\n"
" COMPUTE_FLOAT2 in=CONVERT_COMPUTE_FLOAT2(vload2(0,input+(k << 1)));\n"
" #ifdef USE_IMAGE\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k,oc))))*scale+offset;\n"
" #else\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(vload16(k,weight+weight_offset))*scale+offset;\n"
" #endif\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s0,wei.s01234567,out0);\n"
" }\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)in.s1,wei.s89abcdef,out0);\n"
" }\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" {\n"
" int k=i*loop+loop_end;\n"
" #ifdef USE_IMAGE\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(as_char16(read_imagei(weight,SAMPLER,(int2)(k,oc))))*scale+offset;\n"
" #else\n"
" COMPUTE_FLOAT16 wei=CONVERT_COMPUTE_FLOAT16(vload16(k,weight+weight_offset))*scale+offset;\n"
" #endif\n"
" {\n"
" out0=mad((COMPUTE_FLOAT8)input[k << 1],wei.s01234567,out0);\n"
" }\n"
" }\n"
" #endif\n"
" #else\n"
" #ifdef ASYMMETRIC\n"
" COMPUTE_FLOAT8 scale,offset;\n"
" {\n"
" COMPUTE_FLOAT16 scaleOffset=CONVERT_COMPUTE_FLOAT16(convert_float16(vload16(0,dequantScaleOffset+oc8*2+i*dstChannelC4*8))/coef);\n"
" scale=scaleOffset.s02468ace;\n"
" offset=scaleOffset.s13579bdf;\n"
" }\n"
" #else\n"
" COMPUTE_FLOAT8 scale=CONVERT_COMPUTE_FLOAT8(convert_float8(vload8(0,dequantScaleOffset+oc8+i*dstChannelC4*4))/coef);\n"
" COMPUTE_FLOAT8 offset=0;\n"
" #endif\n"
" for (int j=0; j<loop_end; j++) {\n"
" int k=i*loop+j;\n"
" COMPUTE_FLOAT8 wei;\n"
" COMPUTE_FLOAT4 in=CONVERT_COMPUTE_FLOAT4(vload4(0,input+(k << 2)));\n"
" #ifdef USE_IMAGE\n"
" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(k,oc)));\n"
" #else\n"
" uchar16 charWeightsInt40=vload16(k,weight+weight_offset);\n"
" #endif\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s0123,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s0,wei,out0);\n"
" }\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s4567,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s1,wei,out0);\n"
" }\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s89ab,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s2,wei,out0);\n"
" }\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.scdef,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)in.s3,wei,out0);\n"
" }\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM != 0\n"
" {\n"
" int k=i*loop+loop_end;\n"
" int k4=k << 2;\n"
" COMPUTE_FLOAT8 wei;\n"
" #ifdef USE_IMAGE\n"
" uchar16 charWeightsInt40=as_uchar16(read_imagei(weight,SAMPLER,(int2)(k,oc)));\n"
" #else\n"
" uchar16 charWeightsInt40=vload16(k,weight+weight_offset);\n"
" #endif\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s0123,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)input[k4],wei,out0);\n"
" }\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 2\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s4567,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)input[k4+1],wei,out0);\n"
" }\n"
" #endif\n"
" #if INPUT_CHANNEL_LEAVES_NUM >= 3\n"
" {\n"
" UCHAR4_TO_CHAR8(charWeightsInt40.s89ab,scale,offset);\n"
" out0=mad((COMPUTE_FLOAT8)input[k4+2],wei,out0);\n"
" }\n"
" #endif\n"
" }\n"
" #endif\n"
" #endif\n"
" }\n"
"#ifdef RELU\n"
" out0=fmax(out0,(COMPUTE_FLOAT8)0);\n"
"#endif\n"
"#ifdef RELU6\n"
" out0=clamp(out0,(COMPUTE_FLOAT8)0,(COMPUTE_FLOAT8)6);\n"
"#endif\n"
" #ifdef OUTPUT_CHANNEL_LEAVES\n"
" vstore4(CONVERT_FLOAT4(out0.s0123),0,output+oc8);\n"
" if(oc8+4<dstChannelC4*4)\n"
" vstore4(CONVERT_FLOAT4(out0.s4567),0,output+oc8+4);\n"
" #else\n"
" vstore8(CONVERT_FLOAT8(out0),0,output+oc8);\n"
" #endif\n"
"}\n"
"#endif\n"
;
#endif
}
